* This do files cleans the 1931 Ghana Census entries
* Input files: Census 1931.xls
* Output files: Census 1931.dta, census1931_grid.dta
* Author: Alexander Moradi

****************************************************************************
* Contents
* 1. Destring variables, Cleaning of Census 1931 dataset
* 2. Prepare georeferenced sources 
* 3. Matching between GEONet, 1908 places and Census 1931: Unique matches
* This results in a 93% match

// Ghana's Total pop: 3,065,945
// Ghana's N localities: 12,612

****************************************************************************
clear

tempfile complete matched places1908 places


***************************************************************
*** 1. Destring variables, Cleaning of Census 1931 dataset ***
***************************************************************

import excel "Data\Census 1931.xls", sheet("Form A (Villages)") firstrow case(lower) 

* Men
replace n_c="0" if n_c=="-"
destring n_c, replace

replace m15="0" if m15=="-"
destring m15, replace

replace m15_45="0" if m15_45=="-"
destring m15_45, replace

replace m46="0" if m46=="-"
destring m46, replace

* Women
replace f15="0" if f15=="-"
destring f15, replace

replace f15_45="0" if f15_45=="-"
destring f15_45, replace

replace f46="0" if f46=="-"
destring f46, replace

replace total="0" if total=="-"
destring total, replace dpcomma

* Infirmities
replace edu="0" if edu=="-"
destring edu, replace

replace lepers="0" if lepers=="-"
destring lepers, replace

replace blind="0" if blind=="-"
destring blind, replace

replace deaf="0" if deaf=="-"
destring deaf, replace 

replace mental="0" if mental=="-"
destring mental, replace

gen plc_census=town
move plc_census town

* District Variable: Remove "District" in name of region
replace region= subinstr(region," District","",.) if strpos(region,"District")>1

label var plc_census "Original name of locality Census 1931"
label var region "Region (Census 1931)"
sort region division town total

replace town=lower(town)
replace town=subinstr(town," ","",.)
replace town="Jerusalem" if town=="Jeruselem"
replace town=regexr(town, "IV", "Number 4")
replace town=regexr(town, "III", "Number 3")
replace town=regexr(town, "II", "Number 2")
replace town=regexr(town, "I", "Number 1")
sort town

rename town town_census
rename region region_census
rename division division_census
rename id id_census
gen town31=town
gen region31=region
gen division31=division

drop division31 town31
save "Data\Census 1931.dta", replace





*****************************************
*** 2. Prepare georeferenced sources ***
*****************************************

* 2a) 1908 Gazetter
import dbase using "Data\places1908.dbf", case(lower) clear
drop if town==""
drop _* f1* objectid fid_1 n page see fid_2 
rename district district1908
rename division division31
rename region region31
rename town_1 cap_division31
label var cap_division31 "Capital of 1931 Census division"
rename lat_1 lat_capital_division
rename lon_1 lon_capital_division
rename descrip dsg
rename town dsg_label
drop if district1908=="Ivory Coast"
gen town=lower(name)
replace town = subinstr(town," ","",.)
replace town = subinstr(town,"(rns)","",.)
replace town = substr(town,1,strpos(town,"(")-1) if strpos(town,"(")>1
gen source=1908

save `places1908', replace

* 2b) 2019 Gazetter

*** keep populated places only : requirement was implemented in Mapinfo ***
* keep if dsg=="PPL" | dsg=="PPLA" | dsg=="PPLX"

* geonet_place31.csv is the data set with GEOnet place names and 
* a) the 1931 district in which the place is located and 
* b) the distance of the GEONet place to the 1931 district border

import excel "Data\places2019.xlsx", sheet("places2019") firstrow case(lower) clear
*** replacing upper case letters with lower case letters ***
* Variable town is the string varible that matches the dataset
replace town = subinstr(town," ","",.)
replace town=lower(town)
gen source=2019

* 2c) Merging the two sources
append using `places1908'

** Missing ufi, region, district31, division31, lat_capital_division, lon_capital_division for Aowin State
// Aowin
replace district31="Aowin" if lon<-2.40 & district31==""
replace division31="Aowin State" if lon<-2.40 & division31==""
replace cap_division31="Dadiaso" if lon<-2.40 & cap_division31==""
replace lat_capital_division=6.1167 if lon<-2.40 & district31=="Aowin"
replace lon_capital_division=-3.033 if lon<-2.40 & district31=="Aowin"
replace region="Aowin" if district31=="Aowin"
// Birim (Kwahu)
replace division31="Kwahu" if district31=="Birim (Kwahu)"
replace cap_division31="Pepease"  if district31=="Birim (Kwahu)"
replace lat_capital_division=6.7 if district31=="Birim (Kwahu)"
replace lon_capital_division=-0.7333 if district31=="Birim (Kwahu)"
replace region="Birim (Kwahu)"  if district31=="Birim (Kwahu)"

rename town town_places 
sort town district31 source
edit town district31 source
drop if town==town[_n+1] & district31==district31[_n+1] & source==1908
drop distance
gen id_places=_n

save `places', replace
*******************************************************************************
*******************************************************************************



*******************************************************************************
* 3 Matching
*******************************************************************************
use "Data\Census 1931.dta", replace
sort region31 town_census

*** Merge many to many ***
joinby region31 using `places', unmatched(both) _merge(_merge)



********************************************************************************
** Iteration 1 // Matching by division
** Unique match (10): Same name, in the same division
gen name_matches=(town_census==town_places & division_census==division31)
duplicates tag id_census division_census name_matches, gen(d1) // Duplicates in the match of census places (id_census)
duplicates tag id_places division_census name_matches, gen(d2) // Duplicates in the match of places (id_places)
gen match=10 if name_matches==1 & d1==0 & d2==0 // match only if these are UNIQUE pairs; 1 id_census matched to 1 id_place

* Duplicate matches: Same name, in the same division
// geographic coordinates less than 0.05 away, randomly choose one
sort name_m d1 id_census ufi 
drop if id_census==id_census[_n-1] & lat-lat[_n-1]<=0.05 & lon-lon[_n-1]<=0.05 & name_m==1
drop d1
duplicates tag town_census division_census name_matches, gen(d1)
replace match=10 if name_matches==1 & d1==0 & match==.

// More than 2 census localities, but only one place coordinate -> Assign the larger locality to this place 
drop d1
duplicates tag id_places division_census name_matches, gen(d1)
gsort d1 id_places -total 
replace match=21 if name_matches==1 & id_places==id_places[_n+1] & d1==1 & total>total[_n+1]

/// checked by hand (because they later cause issues with other duplicates
replace match=13 if id_census==4123 & id_places==281 // Aboabo
replace match=13 if id_census==13069 & id_places==25136 // Tamale
replace match=13 if id_census==13067 & id_places==16291 // Kumasi
replace match=13 if id_census==9297 & id_places==9186 // Chana
replace match=13 if id_census==3939 & id_places==9912 // Datoko
replace match=13 if id_census==13013 & id_places==4171 // Anamaboe
replace match=13 if id_census==13068 & id_places==7358 // 
replace match=13 if id_census==2826 & id_places==5832 // 
replace match=13 if id_census==7604 & id_places== 13999 // Juaso old town
replace match=13 if id_census==7606 & id_places== 14002 // Juaso new town
replace match=13 if id_census==8077 & id_places== 20577 // obuasinsuta
replace match=13 if id_census==8079 & id_places== 7612 // Bedsiadzi
replace match=13 if id_census==9667 & id_places== 12101 // Fwego
replace match=13 if id_census==9327 & id_places== 24423 // Sissina
replace match=13 if id_census==7992 & id_places== 12211 // Fomena
replace match=13 if id_census==8843 & id_places== 9077 // Burafoo
replace match=13 if id_census==6574 & id_places== 27488 // Burafoo
replace match=13 if id_census==2283 & id_places== 11178 // Ebiram (Upper and Lower)
replace match=13 if id_census==4185 & id_places== 26838 // Ebiram (Upper and Lower)


drop _merge d1 d2

save `complete', replace
********************************************************************************

****************
* Remove matched places from possible candidates (from id_census but also from id_places)
* Keep matched places
keep if match!=.
keep id_places id_census
save `matched'

// Drop duplicate id_places (that were assigned to id_censuses)
use `complete', replace
merge m:1 id_places using `matched'
drop if _merge==3 & match==.
drop _merge

// Drop duplicate id_census (that were assigned to id_places)
merge m:1 id_census using `matched'
drop if _merge==3 & match==.
drop _merge
****************

/// Here: No duplicate among matches


********************************************************************************
* Iteration 2 // Matching by region
** Unique match (10): Same name, in the same region

** Unique match: Same name, in the same region
gen name_matches2=(town_census==town_places & region_census==region31)
duplicates tag id_census name_matches2, gen(d1) // Duplicates in the match of census places (id_census)
duplicates tag id_places name_matches2, gen(d2) // Duplicates in the match of places (id_places)
replace match=11 if name_matches2==1 & d1==0 & d2==0 & match==. // match only if these are UNIQUE pairs; 1 id_census matched to 1 id_place

drop d1 d2


****************
* Remove matched places from possible candidates (from id_census but also from id_places)
save `complete', replace
keep if match!=.
keep id_places id_census
save `matched', replace

// Drop duplicate id_places (that were assigned to id_censuses)
use `complete', replace
merge m:1 id_places using `matched'
drop if _merge==3 & match==.
drop _merge

merge m:1 id_census using `matched'
drop if _merge==3 & match==.
drop _merge
****************

********************************************************************************
// 2 (or more) census localities assigned to the same GEonet place coordinate -> Assign the geonet place to the more populated census place 
duplicates tag id_places region_census name_matches2, gen(d3)
gsort d3 id_places -total
replace match=22 if name_matches2==1 & id_places==id_places[_n+1] & id_places!=id_places[_n-1] & d3>=1 & total>=total[_n+1] 
gsort d3 id_census
replace match=. if name_matches2==1 & id_census==id_census[_n-1] & d3>=1 


****************
* Remove matched places from possible candidates (from id_census but also from id_places)
save `complete', replace
keep if match!=.
keep id_places id_census
save `matched', replace

// Drop duplicate id_places (that were assigned to id_censuses)
use `complete', replace
merge m:1 id_places using `matched'
drop if _merge==3 & match==.
drop _merge

merge m:1 id_census using `matched'
drop if _merge==3 & match==.
drop _merge
****************


// One census locality, but more than one place coordinate -> randomly choose one
duplicates tag id_census region_census name_matches2, gen(d4)
gsort name_matches2 d4 id_census id_places
replace match=23 	if name_matches2==1 & match==. 	& (id_census==id_census[_n+1] & id_census!=id_census[_n-1]) & d4>=1 // choose the one geonet place with the lowest id_place

drop d3 d4
********************************************************************************


****************
* Remove matched places from possible candidates (from id_census but also from id_places)
save `complete', replace
keep if match!=.
keep id_places id_census
save `matched', replace

// Drop duplicate id_places (that were assigned to id_censuses)
use `complete', replace
merge m:1 id_places using `matched'
drop if _merge==3 & match==.
drop _merge

merge m:1 id_census using `matched'
drop if _merge==3 & match==.
drop _merge
****************



*****************************************************************************
**************************** Levenshtein matches ****************************

* Calculate Levenshtein distance
levenshtein town_census town_places, gen(lev_dist)
gen name_length=strlen(town_census)
gen lev_ratio=lev_dist/name_length

tempfile complete matched
*****************************************************************************
* Give priority to places in the same division
// Choose the one with the LOWEST levenshtein distance as long as smaller than a threshold
sort id_census lev_ratio
replace match=40 if division_census==division31 & lev_ratio<=0.32 & id_census!=id_census[_n-1] & match==.
// Problem: id_place matched to more than one id_census
// If so, the id_census with the lower lev_ratio will get the id_place

sort id_place lev_ratio
drop if match==40 & id_place==id_place[_n-1] // DROP Otherwise the id_census will be again matched with that id_place
*****************************************************************************


****************
* Remove matched places from possible candidates (from id_census but also from id_places)
save `complete', replace
keep if match!=.
keep id_places id_census
save `matched', replace

// Drop duplicate id_places (that were assigned to id_censuses)
use `complete', replace
merge m:1 id_places using `matched'
drop if _merge==3 & match==.
drop _merge

merge m:1 id_census using `matched'
drop if _merge==3 & match==.
drop _merge
****************

* Iteration of same division
// Choose the one with the LOWEST levenshtein distance as long as smaller than a threshold
sort id_census lev_ratio
replace match=40 if division_census==division31 & lev_ratio<=0.32 & id_census!=id_census[_n-1] & match==.
sort id_place lev_ratio
drop if match==40 & id_place==id_place[_n-1] // DROP Otherwise the id_census will be again matched with that id_place

tempfile matched complete
****************
* Remove matched places from possible candidates (from id_census but also from id_places)
save `complete', replace
keep if match!=.
keep id_places id_census
save `matched', replace

// Drop duplicate id_places (that were assigned to id_censuses)
use `complete', replace
merge m:1 id_places using `matched'
drop if _merge==3 & match==.
drop _merge

merge m:1 id_census using `matched'
drop if _merge==3 & match==.
drop _merge
****************


*****************************************************************************
* Places of the same REGION
// Choose the one with the LOWEST levenshtein distance as long as smaller than a threshold
sort id_census lev_ratio
replace match=41 if region_census==region31 & lev_ratio<=0.25 & id_census!=id_census[_n-1] & match==.
// Problem: id_place matched to more than one id_census
// If so, the id_census with the lower lev_ratio will get the id_place

sort id_place lev_ratio
drop if match==41 & id_place==id_place[_n-1] // DROP Otherwise the id_census will be again matched with that id_place


tempfile matched complete
****************
* Remove matched places from possible candidates (from id_census but also from id_places)
save `complete', replace
keep if match!=.
keep id_places id_census
save `matched', replace

// Drop duplicate id_places (that were assigned to id_censuses)
use `complete', replace
merge m:1 id_places using `matched'
drop if _merge==3 & match==.
drop _merge

merge m:1 id_census using `matched'
drop if _merge==3 & match==.
drop _merge
****************

* Iteration of same REGION
// Choose the one with the LOWEST levenshtein distance as long as smaller than a threshold
sort id_census lev_ratio
replace match=41 if region_census==region31 & lev_ratio<=0.25 & id_census!=id_census[_n-1] & match==.
sort id_place lev_ratio
drop if match==41 & id_place==id_place[_n-1] // DROP Otherwise the id_census will be again matched with that id_place

tempfile matched complete
****************
* Remove matched places from possible candidates (from id_census but also from id_places)
save `complete', replace
keep if match!=.
keep id_places id_census
save `matched', replace

// Drop duplicate id_places (that were assigned to id_censuses)
use `complete', replace
merge m:1 id_places using `matched'
drop if _merge==3 & match==.
drop _merge

merge m:1 id_census using `matched'
drop if _merge==3 & match==.
drop _merge
****************



* Final Check for duplicates
gen matched=1 if match!=.
duplicates tag id_census matched, gen(jjj1) // Duplicates in the match of census places (id_census)
duplicates tag id_places matched, gen(jjj2) // Duplicates in the match of places (places)
tab jjj1 jjj2 if match!=.

tempfile matched
************* Preparing final data set
* Keep matched
keep if match!=.
keep id_places id_census match
duplicates drop id_census match, force // There should be no duplicates left
save `matched', replace

use "Census 1931.dta", replace
merge 1:1 id_census using `matched'
drop _merge

merge m:1 id_places using "C:\Users\am401\Dropbox\D_Drive\Sources\Ghana\Censuses\Census 1931\places.dta"
drop if _merge==2 // drop all geonet places that were not matched
drop _merge

** Not yet final dataset
* 1. check success with admin division capitals
* 2. Add admin division capitals ot the known locations


* Check of success
gen town=town_census
gen division=division_census
merge m:1 division town using "Data\admin_divisions_1931.dta"
gen admin_capital=(_merge==3)
keep if _merge!=2
drop _merge
tab match admin_capital // Not matched
// => 83% matched

* Is it the correct match?
gen true=1 if abs(lat-lat_capital)<0.02 & abs(lon-lon_capital)<0.02 & admin_capital==1
replace true=0 if admin_capital==1 & true==. & match!=.
// 6 cases wrongly matched

replace lon=lon_capital_division if match==. | true==0
replace lat=lat_capital_division if match==. | true==0
replace match=100 if admin_capital==1



************************************************
** 
label define match 100 "Division Centre"

label define match 23 "Duplicate match (same name & region, duplicate place locations, random assignment)", modify
label define match 22 "Duplicate match (same name & region, duplicate census locations, larger locality)", modify
label define match 21 "Duplicate match (same name & division, duplicate census locations, larger locality)", modify

label define match 13 "Duplicates, match by hand", modify
label define match 11 "Unique match (same name & region, no duplicates)", modify
label define match 10 "Unique match (same name & division, no duplicates)", modify

label define match 41 "Levenshtein match, same region", modify
label define match 40 "Levenshtein match, same division", modify

label define match 1 "No match", modify
label values match match

* 4,000-4027  means that the Levenshtein match was done within the same division
* 4100 - 4127 means that the Levenshtein match was done within the same district
* the odd numbers in between indicate the ratio of levensthein distance/ length of place name. I started with 1/4=0.25 - meaning I did a match when 1 out of 4 changes would result in a match. I then added 0.03, 0.06. etc. So the higher the number, the more likely the match is wrong.
* This means, you have one dataset and can select based on the match. With increasing levenshtein ratio, errors should increase e.g. 4027 means that more than every second letter was changed so to arrive at a match (but place is within the same division.

gen matched=(match!=.)
tabstat total education lepers blindness deafanddumb mentally_deranged, statistics( sum ) by(matched)

gen goodmatch=(match<400)
gen goodmatch2=(match<400 | match==4000 | match==4100)

egen districtfe=group(region_census)

gen lntotal=ln(total)
gen lneducation=education+1
replace lneducation=ln(lneducation)


areg matched lntotal lneducation, absorb(districtfe)
outreg2 using "check_measurement error.xls", excel replace
areg goodmatch lntotal lneducation, absorb(districtfe)
outreg2 using "check_measurement error.xls", excel append
areg goodmatch2 lntotal lneducation, absorb(districtfe)
outreg2 using "check_measurement error.xls", excel append


areg matched lntotal lneducation if total>=750, absorb(districtfe)
outreg2 using "check_measurement error.xls", excel  addtext(only places>=750) append
areg goodmatch lntotal lneducation if total>=750, absorb(districtfe)
outreg2 using "check_measurement error.xls", excel  addtext(only places>=750) append
areg goodmatch2 lntotal lneducation if total>=750, absorb(districtfe)
outreg2 using "check_measurement error.xls", excel  addtext(only places>=750) append


areg matched lntotal lneducation if total>=1000, absorb(districtfe)
outreg2 using "check_measurement error.xls", excel  addtext(only places>=1000) append
areg goodmatch lntotal lneducation if total>=1000, absorb(districtfe)
outreg2 using "check_measurement error.xls", excel  addtext(only places>=1000) append
areg goodmatch2 lntotal lneducation if total>=1000, absorb(districtfe)
outreg2 using "check_measurement error.xls", excel  addtext(only places>=1000) append











































**********************************************************
** Aggregate the 1931 Ghana Census by grid
**********************************************************

** Edcuation and Infirmities was not collected for Towns (Form B). These should be exluded from the analysis
drop if town_formB==1

collapse (sum) n_compounds m15 m15_45 m46 f15 f15_45 f46 education lepers blindness deafanddumb mentally_deranged total (count) n_places=total (max) largest_town=total, by (gridcell)
drop if gridcell==""

rename deaf deaf
rename mental mental

label var n_places "Number of localities"

** 1. Education acquired at the age of 6-15, but should measure the stock, not the flow
* => Keep age groups 0-46 male and female in the denominator should be 
gen education_cap=education/(m15 + m15_45 + f15 + f15_45)

* 2. Infirmities, probably an issue with old people??? 
foreach var of varlist lepers blindness deaf mental {
gen `var'_cap=`var'/(m15 + m15_45 + m46 + f15 + f15_45 + f46)
label var `var'_cap "`var' per population" 
}

** 3. Number of compounds can be used as a measure of welfare => the poorer, the more crowded?
gen pop_compound=(m15 + m15_45 + m46 + f15 + f15_45 + f46)/n_compounds
label var pop_compound "People per compound"


** 4. Measure of migration and labour scarcity 
gen male2female =(m15_45)/(f15_45)
label var male2female "Male to Female ratio"

** 5. Settlement Structure
label var n_places "Number of localities in gridcell"
label var largest_town "Population of largest town (village)" 

save "Data\census1931_grid.dta", replace
